Variáveis utilizadas: séries, notas, temporadas, sequência dos episódios
Importância das variáveis: 1) notas 2) sequência de episódios 3) temporadas 4) séries
Removendo inconsistência
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# Load the series data set from the project's data directory.
# readr reports the column types it guessed when no spec is supplied.
dados.series <- read_csv(file = "../data/series_from_imdb.csv")
## Parsed with column specification:
## cols(
## series_name = col_character(),
## series_ep = col_integer(),
## season = col_integer(),
## season_ep = col_integer(),
## url = col_character(),
## Episode = col_character(),
## UserRating = col_double(),
## UserVotes = col_double(),
## r1 = col_double(),
## r10 = col_double(),
## r2 = col_double(),
## r3 = col_double(),
## r4 = col_double(),
## r5 = col_double(),
## r6 = col_double(),
## r7 = col_double(),
## r8 = col_double(),
## r9 = col_double()
## )
# Names of the series that have at least one row with season > 8,
# flattened to a (named) character vector.
series.mais.que.oito.temp <- dados.series %>%
  filter(season > 8) %>%
  select(series_name) %>%
  unique() %>%
  unlist()

# "Not in" operator: the element-wise negation of %in%.
`%!in%` <- function(x, y) {
  !(x %in% y)
}

# Drop every row belonging to a series that runs past eight seasons
# (the original note says this removes 4 series).
dados.series <- filter(
  dados.series,
  series_name %!in% series.mais.que.oito.temp
)
# Summary statistics of the season column after the filter above.
summary(dados.series[["season"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.103 4.000 8.000
# 75% dos episódios pertencem, no máximo, à 4ª temporada (3º quartil = 4)
# Total user votes per series, used as the popularity measure.
pop.series <- summarise(
  group_by(dados.series, series_name),
  votes = sum(UserVotes)
)

# The five most-voted series. top_n() keeps ties, so more than five rows
# are possible if vote totals coincide.
top.series <- top_n(pop.series, 5, votes)

# Restrict the working data to rows of those series.
dados.final <- filter(
  dados.series,
  series_name %in% top.series$series_name
)

# Highest season number reached by each selected series.
temps <- summarise(
  group_by(dados.final, series_name),
  temps = max(season)
)

# Highest series-wide episode number reached by each selected series.
eps <- summarise(
  group_by(dados.final, series_name),
  eps = max(series_ep)
)
# Disabled step kept from the original: filter(eps <= 10)
plot
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive line chart: user rating (y) against the series-wide episode
# number (x), with one coloured line per series.
dados.final %>%
  plot_ly(
    x = ~series_ep,
    y = ~UserRating,
    color = ~series_name,
    type = "scatter",
    mode = "lines"
  )
# NOTE(review): this chunk works on the `economics` data set, not on the
# series data, and `p` is not used anywhere in the visible part of the file.
# It looks like a leftover plotting example -- confirm before removing.
#
# Steps: reshape every column except `date` into variable/value pairs, give
# each variable an integer id, map that id to a y-axis name ("y1", "y2", ...),
# add line traces, and arrange the result with subplot() in 5 rows sharing
# the x axis.
p <- economics %>%
tidyr::gather(variable, value, -date) %>%
transform(id = as.integer(factor(variable))) %>%
plot_ly(x = ~date, y = ~value, color = ~variable, colors = "Dark2",
yaxis = ~paste0("y", id)) %>%
add_lines() %>%
subplot(nrows = 5, shareX = TRUE)
# Usar um gráfico de áreas empilhadas seria melhor
# https://plot.ly/r/filled-area-plots/
library(highcharter)
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
# hc <- highchart() %>%
# hc_chart(type = "area") %>%
# hc_title(text = "Historic and Estimated Worldwide Population Distribution by Region") %>%
# hc_subtitle(text = "Source: Wikipedia.org") %>%
# hc_xAxis(categories = as.character(c(1:113)),
# tickmarkPlacement = "on",
# title = list(enabled = FALSE)) %>%
# hc_yAxis(title = list(text = "Percent")) %>%
# hc_tooltip(split = TRUE, valueSuffix = ' millions') %>%
# hc_plotOptions(area = list(
# stacking = "normal",
# lineColor = "#666666",
# lineWidth = 1,
# marker = list(
# lineWidth = 1,
# lineColor = "#666666"
# ))
# ) %>%
# hc_add_series(name = "Game of Thrones", game.of.thrones$UserRating) %>%
# hc_add_series(name = "Breaking bad", data = breaking.bad$UserRating) %>%
# hc_add_series(name = "Dexter", data = dexter$UserRating) %>%
# hc_add_series(name = "The Walking Dead", the.walking.dead$UserRating) %>%
# hc_add_series(name = "Arrow", data = arrow$UserRating)
#
# hc
# Turn the per-score columns r1..r10 into approximate vote counts by
# multiplying each by the row's UserVotes and rounding, then group the
# counts into five rating bands: 1-3, 4-5, 6-7, 8-9 and 10.
# (Presumably r<k> is the fraction of votes that gave score k -- confirm
# against the data dictionary.)
dados.final <- mutate(
  dados.final,
  notas.pessimas = round(UserVotes * r1) +
    round(UserVotes * r2) +
    round(UserVotes * r3),
  notas.ruins = round(UserVotes * r4) + round(UserVotes * r5),
  notas.medianas = round(UserVotes * r6) + round(UserVotes * r7),
  notas.boas = round(UserVotes * r8) + round(UserVotes * r9),
  nota.maxima = round(UserVotes * r10)
)
# Vote counts per rating band, summed over all rows of each series.
chart.data <- dados.final %>%
  group_by(series_name) %>%
  summarise(
    pessimas = sum(notas.pessimas),
    ruins = sum(notas.ruins),
    medianas = sum(notas.medianas),
    boas = sum(notas.boas),
    maxima = sum(nota.maxima)
  )

# The same table expressed as each band's share of the series' total.
# The total is computed once from the per-band counts instead of repeating
# the five-term denominator in every column. The counts are sums of rounded
# values, so the result matches the original formulation.
chart.data2 <- chart.data %>%
  mutate(
    # mutate() evaluates in order, so `total` still sees the raw counts.
    total = pessimas + ruins + medianas + boas + maxima,
    pessimas = pessimas / total,
    ruins = ruins / total,
    medianas = medianas / total,
    boas = boas / total,
    maxima = maxima / total
  ) %>%
  select(-total)
# One data frame per series, selected by exact series name.
# These objects are referenced only by the commented-out highcharter
# draft in the visible part of the file.
breaking.bad <- filter(dados.final, series_name == "Breaking Bad")
dexter <- filter(dados.final, series_name == "Dexter")
arrow <- filter(dados.final, series_name == "Arrow")
game.of.thrones <- filter(dados.final, series_name == "Game of Thrones")
the.walking.dead <- filter(dados.final, series_name == "The Walking Dead")
# Grouped column chart: series on the x axis, one column per rating band,
# heights taken from the summed vote counts in chart.data.
# The title and subtitle previously carried text about worldwide population
# and Wikipedia, which did not describe this data; they now name the chart's
# actual content and source.
hc2 <- highchart() %>%
  hc_title(text = "Distribuição das notas dos usuários por série") %>%
  hc_subtitle(text = "Fonte: IMDB") %>%
  hc_xAxis(categories = as.character(chart.data$series_name)) %>%
  hc_series(
    list(type = "column", name = "Péssimas", data = chart.data$pessimas),
    list(type = "column", name = "Ruins", data = chart.data$ruins),
    list(type = "column", name = "Medianas", data = chart.data$medianas),
    list(type = "column", name = "Boas", data = chart.data$boas),
    list(type = "column", name = "Máxima", data = chart.data$maxima),
    # NOTE(review): this spline series is declared without `data`, so it
    # presumably shows up only as an empty legend entry. Supply the intended
    # average values or drop the series -- intent is not clear from here.
    list(type = "spline", name = "Média")
  )

# Print the chart so it renders in the output.
hc2